# encoding:utf-8

"""
@version: python2.7
@author : 'l00383533'
@file   : multispider.py
@time   : 2017/3/29 11:11
@todo   : 多线程处理的爬虫
"""
import thread

import time

from com.luodongseu.conf.propertiseman import getValue
from com.luodongseu.spiderman import downloader
from com.luodongseu.spiderman import parserman
from com.luodongseu.spiderman import urlmanager

# Shared collaborator singletons used by every worker thread.
down = downloader.Downloader()      # fetches raw html for a url
urlman = urlmanager.UrlManager()    # accumulates discovered urls (shared, must be locked)
parser = parserman.ParserMan()      # extracts urls + data from a page

# Root url of the site being crawled, read from the properties file.
baseUrl = getValue('base_url')

threadLock = thread.allocate_lock()  # guards urlman against concurrent mutation


def crawl(url):
    html = ""
    try:
        html = down.download(url)
    except Exception as e:
        print e
    # 处理网页，返回有用的数据和与url
    urls, data = parser.parser(html, baseUrl)
    # 添加url到url管理器中 这里需要添加同步锁
    threadLock.acquire()  # 捕获锁
    urlman.add_new_urls(urls)
    print "url 个数： " + str(urlman.len_of_new_urls())
    threadLock.release()  # 释放锁


for i in range(259200, 259230):
    print "============Start find question url: " + baseUrl + "?page=<" + str(i) + ">=============="
    url = baseUrl + "?page=" + str(i)
    thread.start_new_thread(crawl, (url,))  # 创建线程

time.sleep(20)  # 不加这个会出现sys.excepthook is missing异常
